{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "import traceback\n",
    "\n",
    "pair_path = 'esmall_pairs.json'\n",
    "bugrepo_path = 'esmall_clear.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def get_pair_frame(file_path):\n",
    "    dicList=[json.loads(line) for line in open(file_path)]\n",
    "    res_list = []\n",
    "    for dic in dicList:\n",
    "        res_list.append([dic['bug1'], dic['bug2'], dic['dec']])\n",
    "    res_list = pd.DataFrame(res_list, columns=['bug1', 'bug2', 'dec'])\n",
    "    return res_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bug1</th>\n",
       "      <th>bug2</th>\n",
       "      <th>dec</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>214301</td>\n",
       "      <td>214611</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>214623</td>\n",
       "      <td>214825</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>214445</td>\n",
       "      <td>214451</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>214466</td>\n",
       "      <td>214452</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>214181</td>\n",
       "      <td>214620</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     bug1    bug2  dec\n",
       "0  214301  214611    1\n",
       "1  214623  214825   -1\n",
       "2  214445  214451    1\n",
       "3  214466  214452    1\n",
       "4  214181  214620   -1"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pairs = get_pair_frame(pair_path)\n",
    "pairs.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def extract_bugrepo(file_path):\n",
    "    '''\n",
    "        Json like this\n",
    "        {\n",
    "            \"_id\"         :{\"$oid\":\"52e9a43354dc1c24f597bef8\"},\n",
    "            \"bug_id\"      :\"214065\",\n",
    "            \"product\"     :\"BIRT\",\n",
    "            \"description\" :\"Description:\\n[Regression] ...\",\n",
    "            \"bug_severity\":\"normal\",\n",
    "            \"dup_id\"      :[],\n",
    "            \"short_desc\"  :\"[Regression]Group TOC are create ... PDF\",\n",
    "            \"priority\"    :\"P3\",\n",
    "            \"version\"     :\"2.3.0\",\n",
    "            \"component\"   :\"Report Engine\",\n",
    "            \"delta_ts\"    :\"2008-01-02 21:38:46 -0500\",\n",
    "            \"bug_status\"  :\"CLOSED\",\n",
    "            \"creation_ts\" :\"2008-01-02 00:34:00 -0500\",\n",
    "            \"resolution\"  :\"FIXED\"\n",
    "        }\n",
    "    '''\n",
    "    dicList=[json.loads(line) for line in open(file_path)]\n",
    "    res_list = []\n",
    "    for dic in dicList:\n",
    "        res_list.append([dic['bug_id'], dic['product'], dic['description'], dic['bug_severity'],\\\n",
    "                         dic['dup_id'], dic['short_desc'], dic['priority'], dic['version'],\\\n",
    "                         dic['component'], dic['delta_ts'], dic['bug_status'], dic['creation_ts'],\\\n",
    "                         dic['resolution']\n",
    "                        ])\n",
    "    res_list = pd.DataFrame(res_list, columns=['bug_id', 'product', 'description', 'bug_severity',\n",
    "                                               'dup_id', 'summary', # change short_desc to summary\n",
    "                                               'priority', 'version', 'component', 'delta_ts', 'bug_status',\n",
    "                                               'creation_ts', 'resolution'\n",
    "                                              ])\n",
    "    return res_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bug_id</th>\n",
       "      <th>product</th>\n",
       "      <th>description</th>\n",
       "      <th>bug_severity</th>\n",
       "      <th>dup_id</th>\n",
       "      <th>summary</th>\n",
       "      <th>priority</th>\n",
       "      <th>version</th>\n",
       "      <th>component</th>\n",
       "      <th>delta_ts</th>\n",
       "      <th>bug_status</th>\n",
       "      <th>creation_ts</th>\n",
       "      <th>resolution</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>214065</td>\n",
       "      <td>BIRT</td>\n",
       "      <td>Description:\\n[Regression]Group TOC are create...</td>\n",
       "      <td>normal</td>\n",
       "      <td>[]</td>\n",
       "      <td>[Regression]Group TOC are created automaticall...</td>\n",
       "      <td>P3</td>\n",
       "      <td>2.3.0</td>\n",
       "      <td>Report Engine</td>\n",
       "      <td>2008-01-02 21:38:46 -0500</td>\n",
       "      <td>CLOSED</td>\n",
       "      <td>2008-01-02 00:34:00 -0500</td>\n",
       "      <td>FIXED</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>214070</td>\n",
       "      <td>BIRT</td>\n",
       "      <td>Output column page in data set editor used Res...</td>\n",
       "      <td>normal</td>\n",
       "      <td>[]</td>\n",
       "      <td>ResultSetColumnHandle should not be cached in ...</td>\n",
       "      <td>P3</td>\n",
       "      <td>2.3.0</td>\n",
       "      <td>Data</td>\n",
       "      <td>2008-01-02 04:14:41 -0500</td>\n",
       "      <td>RESOLVED</td>\n",
       "      <td>2008-01-02 01:55:00 -0500</td>\n",
       "      <td>FIXED</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>214068</td>\n",
       "      <td>BIRT</td>\n",
       "      <td>Description:\\n[Regression]Failed to preview Ch...</td>\n",
       "      <td>critical</td>\n",
       "      <td>[]</td>\n",
       "      <td>[Regression]Failed to preview Chart Viewer Exa...</td>\n",
       "      <td>P3</td>\n",
       "      <td>2.3.0</td>\n",
       "      <td>Build</td>\n",
       "      <td>2008-01-02 04:32:33 -0500</td>\n",
       "      <td>CLOSED</td>\n",
       "      <td>2008-01-02 01:35:00 -0500</td>\n",
       "      <td>FIXED</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>214072</td>\n",
       "      <td>BIRT</td>\n",
       "      <td>Description:\\n  Exception is thrown out when l...</td>\n",
       "      <td>normal</td>\n",
       "      <td>[]</td>\n",
       "      <td>[Automation]Exception is thrown out when link ...</td>\n",
       "      <td>P3</td>\n",
       "      <td>2.3.0</td>\n",
       "      <td>Report Engine</td>\n",
       "      <td>2008-01-02 21:42:39 -0500</td>\n",
       "      <td>CLOSED</td>\n",
       "      <td>2008-01-02 02:10:00 -0500</td>\n",
       "      <td>WORKSFORME</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>214071</td>\n",
       "      <td>Platform</td>\n",
       "      <td>Build ID: M20070921-1145\\n\\nSteps To Reproduce...</td>\n",
       "      <td>normal</td>\n",
       "      <td>[]</td>\n",
       "      <td>[Help] About eclipse help pop-up information d...</td>\n",
       "      <td>P3</td>\n",
       "      <td>3.3.1</td>\n",
       "      <td>SWT</td>\n",
       "      <td>2009-01-23 15:01:34 -0500</td>\n",
       "      <td>RESOLVED</td>\n",
       "      <td>2008-01-02 01:58:00 -0500</td>\n",
       "      <td>WORKSFORME</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   bug_id   product                                        description  \\\n",
       "0  214065      BIRT  Description:\\n[Regression]Group TOC are create...   \n",
       "1  214070      BIRT  Output column page in data set editor used Res...   \n",
       "2  214068      BIRT  Description:\\n[Regression]Failed to preview Ch...   \n",
       "3  214072      BIRT  Description:\\n  Exception is thrown out when l...   \n",
       "4  214071  Platform  Build ID: M20070921-1145\\n\\nSteps To Reproduce...   \n",
       "\n",
       "  bug_severity dup_id                                            summary  \\\n",
       "0       normal     []  [Regression]Group TOC are created automaticall...   \n",
       "1       normal     []  ResultSetColumnHandle should not be cached in ...   \n",
       "2     critical     []  [Regression]Failed to preview Chart Viewer Exa...   \n",
       "3       normal     []  [Automation]Exception is thrown out when link ...   \n",
       "4       normal     []  [Help] About eclipse help pop-up information d...   \n",
       "\n",
       "  priority version      component                   delta_ts bug_status  \\\n",
       "0       P3   2.3.0  Report Engine  2008-01-02 21:38:46 -0500     CLOSED   \n",
       "1       P3   2.3.0           Data  2008-01-02 04:14:41 -0500   RESOLVED   \n",
       "2       P3   2.3.0          Build  2008-01-02 04:32:33 -0500     CLOSED   \n",
       "3       P3   2.3.0  Report Engine  2008-01-02 21:42:39 -0500     CLOSED   \n",
       "4       P3   3.3.1            SWT  2009-01-23 15:01:34 -0500   RESOLVED   \n",
       "\n",
       "                 creation_ts  resolution  \n",
       "0  2008-01-02 00:34:00 -0500       FIXED  \n",
       "1  2008-01-02 01:55:00 -0500       FIXED  \n",
       "2  2008-01-02 01:35:00 -0500       FIXED  \n",
       "3  2008-01-02 02:10:00 -0500  WORKSFORME  \n",
       "4  2008-01-02 01:58:00 -0500  WORKSFORME  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bug_repos = extract_bugrepo(bugrepo_path)\n",
    "bug_repos.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def gen(col, bug_repos, pairs):\n",
    "    '''\n",
    "        chose specific column\n",
    "    '''\n",
    "    res = []\n",
    "    for i, r in pairs.iterrows():\n",
    "        try:\n",
    "            res.append([\n",
    "                    bug_repos[bug_repos['bug_id'] == str(r['bug1'])][col].values[0],\n",
    "                    bug_repos[bug_repos['bug_id'] == str(r['bug2'])][col].values[0],\n",
    "                    r['dec']\n",
    "                ])\n",
    "        except:\n",
    "            print(traceback.print_exc())\n",
    "    res = pd.DataFrame(res, columns=[col + '_bug1', col + '_bug2', 'dec'])\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>summary_bug1</th>\n",
       "      <th>summary_bug2</th>\n",
       "      <th>dec</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[update] could not load tasklist hyperlink det...</td>\n",
       "      <td>[update] Sometimes but not selden i get the er...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>WSE hangs in external browser after invoking f...</td>\n",
       "      <td>Loading model aborts on non-fatal error</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[Regression]&lt;Select value...&gt; can not select a...</td>\n",
       "      <td>Select value in table filter condition panel d...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Group completion options issue</td>\n",
       "      <td>[Group Code Assist] No code completion for und...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Add org.apache.bcel</td>\n",
       "      <td>[api tooling] comments from Eugene</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Support cube filter in chart</td>\n",
       "      <td>add API-3.0 and other common tags to project s...</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Max Rydahl Andersen's blog feed contains comments</td>\n",
       "      <td>Move my feed from blog.xam.dk to in.relation.to</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>[Regression] Highlight can not be added and th...</td>\n",
       "      <td>Submitting task fails with invalid date / time...</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Notification e-mails not sent for committer el...</td>\n",
       "      <td>[Regression] The error is of no default value ...</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>unable to reassign tasks if no permissions to ...</td>\n",
       "      <td>cmdbf services make eclipse-specific references</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        summary_bug1  \\\n",
       "0  [update] could not load tasklist hyperlink det...   \n",
       "1  WSE hangs in external browser after invoking f...   \n",
       "2  [Regression]<Select value...> can not select a...   \n",
       "3                     Group completion options issue   \n",
       "4                                Add org.apache.bcel   \n",
       "5                       Support cube filter in chart   \n",
       "6  Max Rydahl Andersen's blog feed contains comments   \n",
       "7  [Regression] Highlight can not be added and th...   \n",
       "8  Notification e-mails not sent for committer el...   \n",
       "9  unable to reassign tasks if no permissions to ...   \n",
       "\n",
       "                                        summary_bug2  dec  \n",
       "0  [update] Sometimes but not selden i get the er...    1  \n",
       "1            Loading model aborts on non-fatal error   -1  \n",
       "2  Select value in table filter condition panel d...    1  \n",
       "3  [Group Code Assist] No code completion for und...    1  \n",
       "4                 [api tooling] comments from Eugene   -1  \n",
       "5  add API-3.0 and other common tags to project s...   -1  \n",
       "6    Move my feed from blog.xam.dk to in.relation.to    1  \n",
       "7  Submitting task fails with invalid date / time...   -1  \n",
       "8  [Regression] The error is of no default value ...   -1  \n",
       "9    cmdbf services make eclipse-specific references   -1  "
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summary = gen('summary', bug_repos, pairs)\n",
    "summary.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pairs.to_csv('esmall_pairs.csv', index=False, encoding='GB18030')\n",
    "bug_repos.to_csv('esmall_bug_repos.csv', index=False, encoding='GB18030')\n",
    "summary.to_csv('esmall_summary.csv', index=False, encoding='GB18030')"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
